#library(nCov2019)
library(leaflet)
library(dplyr)
library(ggplot2)
library(plotly)
library(scales)
library(xts)
library(dygraphs)
library(corrplot)
library(lubridate)
library(fmsb)
library(forecast)
COVID<-read.csv("covid_19_data.csv")
COVID_2<-read.csv("COVID19_12-Apr.csv")
Format date:
Date<-as.Date(COVID_2$Date, format="%m/%d/%y")
COVID_2$Date2<-Date
COVID_updated<-COVID_2 %>% filter(Date2==max(Date2))
leaflet(width = "100%") %>%
addProviderTiles("CartoDB.DarkMatter") %>%
setView(lng = 0, lat = 10, zoom = 1.5) %>%
addCircleMarkers(data = COVID_updated,
lng = ~ Long,
lat = ~ Lat,
radius = ~ log(Confirmed+1),
color = rgb(218/255,65/255,56/255),
fillOpacity = ~ ifelse(Confirmed > 0, 1, 0),
stroke = FALSE,
label = ~ paste(Province.State,",",Country.Region, ": ", Confirmed)
)
Current top 10 countries:
COVID_top<-COVID_2 %>% filter(Date2==max(Date2)) %>%
group_by(Country.Region) %>% summarise(Total_confirmed=sum(Confirmed)) %>%
top_n(10,Total_confirmed) %>% arrange(desc(Total_confirmed))
plot<-ggplot(data=COVID_top
, aes(x=Total_confirmed,y=reorder(Country.Region,Total_confirmed))) +
geom_bar(stat ="identity",alpha=0.8,fill="firebrick3") +
geom_text(aes(label=Total_confirmed), vjust=0.5, hjust=0.9,color="black", size=3.5) +
scale_x_continuous(labels = comma) +
labs(title = paste("Top 10 countries with confirmed cases as of ",max(COVID_2$Date2)),
x = "Confirmed cases",
y = "Country") +
theme_minimal()
ggplotly(plot,tooltip = c("x"),width=750)
Time distribution:
COVID_2_Day<- COVID_2 %>% group_by(Date2) %>% summarise(World_confirmed=sum(Confirmed),
World_deaths=sum(Deaths),
World_recovered=sum(Recovered))
COVID_Day_confirmed_series<-xts(COVID_2_Day$World_confirmed, order.by=COVID_2_Day$Date2)
COVID_Day_deaths_series<-xts(COVID_2_Day$World_deaths, order.by=COVID_2_Day$Date2)
COVID_Day_recovered_series<-xts(COVID_2_Day$World_recovered, order.by=COVID_2_Day$Date2)
Day_summary<-cbind(COVID_Day_confirmed_series,COVID_Day_deaths_series,COVID_Day_recovered_series)
dygraph(Day_summary, main = "SARS-COV2-outbreak: Total worldwide cases",
xlab="Date", ylab="Total cases",width = 750) %>%
dySeries("COVID_Day_confirmed_series", "Total cases",drawPoints = TRUE,
pointSize = 3, color=rgb(53/255,116/255,199/255)) %>%
dySeries("COVID_Day_deaths_series", "Total deaths",drawPoints = TRUE,
pointSize = 3, color=rgb(189/255,55/255,48/255)) %>%
dySeries("COVID_Day_recovered_series", "Total recovered",drawPoints = TRUE,
pointSize = 3, color=rgb(69/255,136/255,51/255)) %>%
dyRangeSelector()
New_count<-function(x)
{
Daily_cases<-numeric(length(x))
for(i in length(x):2)
{
Daily_cases[i]=x[i] - x[i-1]
}
return(Daily_cases)
}
New_cases<-New_count(COVID_2_Day$World_confirmed)
New_deaths<-New_count(COVID_2_Day$World_deaths)
New_recovered<-New_count(COVID_2_Day$World_recovered)
COVID_New_confirmed_series<-xts(New_cases, order.by=COVID_2_Day$Date2)
COVID_New_deaths_series<-xts(New_deaths, order.by=COVID_2_Day$Date2)
COVID_New_recovered_series<-xts(New_recovered, order.by=COVID_2_Day$Date2)
New_summary<-cbind(COVID_New_confirmed_series,COVID_New_deaths_series,COVID_New_recovered_series)
dygraph(New_summary, main = "SARS-COV2-outbreak: Total worldwide cases",
xlab="Date", ylab="Novel coronavirus cases",width = 750) %>%
dySeries("COVID_New_confirmed_series", "New cases",drawPoints = TRUE,
pointSize = 3, color=rgb(53/255,116/255,199/255)) %>%
dySeries("COVID_New_deaths_series", "New deaths",drawPoints = TRUE,
pointSize = 3, color=rgb(189/255,55/255,48/255)) %>%
dySeries("COVID_New_recovered_series", "New recovered",drawPoints = TRUE,
pointSize = 3, color=rgb(69/255,136/255,51/255)) %>%
dyRangeSelector()
Team members countries total cases:
COVID_2_Day_Lebanon<- COVID_2 %>%
filter(Country.Region %in% c("Lebanon")) %>%
group_by(Date2) %>% summarise(World_confirmed=sum(Confirmed))
COVID_2_Day_Chile<- COVID_2 %>%
filter(Country.Region %in% c("Chile")) %>%
group_by(Date2) %>% summarise(World_confirmed=sum(Confirmed))
COVID_2_Day_Colombia<- COVID_2 %>%
filter(Country.Region %in% c("Colombia")) %>%
group_by(Date2) %>% summarise(World_confirmed=sum(Confirmed))
COVID_2_Day_CostaRica<- COVID_2 %>%
filter(Country.Region %in% c("Costa Rica")) %>%
group_by(Date2) %>% summarise(World_confirmed=sum(Confirmed))
COVID_Day_series_Lebanon<-xts(COVID_2_Day_Lebanon$World_confirmed, order.by=COVID_2_Day_Lebanon$Date2)
COVID_Day_series_Chile<-xts(COVID_2_Day_Chile$World_confirmed, order.by=COVID_2_Day_Chile$Date2)
COVID_Day_series_Colombia<-xts(COVID_2_Day_Colombia$World_confirmed, order.by=COVID_2_Day_Colombia$Date2)
COVID_Day_series_CostaRica<-xts(COVID_2_Day_CostaRica$World_confirmed, order.by=COVID_2_Day_CostaRica$Date2)
Our_Countries<-cbind(COVID_Day_series_Lebanon,COVID_Day_series_Chile,COVID_Day_series_Colombia,COVID_Day_series_CostaRica)
dygraph(Our_Countries, main = "SARS-COV2-outbreak: Total cases by country", xlab="Date", ylab="Total cases",width = 750) %>%
dySeries("COVID_Day_series_Lebanon", "Lebanon",drawPoints = TRUE,
pointSize = 3, color=rgb(0,0,3/255)) %>%
dySeries("COVID_Day_series_Chile", "Chile",drawPoints = TRUE,
pointSize = 3,color=rgb(120/255,28/255,109/255)) %>%
dySeries("COVID_Day_series_Colombia", "Colombia",drawPoints = TRUE,
pointSize = 3,color=rgb(237/255,105/255,37/255)) %>%
dySeries("COVID_Day_series_CostaRica", "Costa Rica",drawPoints = TRUE,
pointSize = 3,color=rgb(204/255,197/255,126/255)) %>%
dyRangeSelector()
New_Lebanon<-New_count(COVID_2_Day_Lebanon$World_confirmed)
New_Chile<-New_count(COVID_2_Day_Chile$World_confirmed)
New_Colombia<-New_count(COVID_2_Day_Colombia$World_confirmed)
New_CostaRica<-New_count(COVID_2_Day_CostaRica$World_confirmed)
COVID_New_series_Lebanon<-xts(New_Lebanon, order.by=COVID_2_Day_Lebanon$Date2)
COVID_New_series_Chile<-xts(New_Chile, order.by=COVID_2_Day_Chile$Date2)
COVID_New_series_Colombia<-xts(New_Colombia, order.by=COVID_2_Day_Colombia$Date2)
COVID_New_series_CostaRica<-xts(New_CostaRica, order.by=COVID_2_Day_CostaRica$Date2)
Our_New_Countries<-cbind(COVID_New_series_Lebanon,COVID_New_series_Chile,COVID_New_series_Colombia,COVID_New_series_CostaRica)
dygraph(Our_New_Countries, main = "SARS-COV2-outbreak: New cases by country", xlab="Date", ylab="Total cases",width = 750) %>%
dySeries("COVID_New_series_Lebanon", "Lebanon",drawPoints = TRUE,
pointSize = 3, color=rgb(0,0,3/255)) %>%
dySeries("COVID_New_series_Chile", "Chile",drawPoints = TRUE,
pointSize = 3,color=rgb(120/255,28/255,109/255)) %>%
dySeries("COVID_New_series_Colombia", "Colombia",drawPoints = TRUE,
pointSize = 3,color=rgb(237/255,105/255,37/255)) %>%
dySeries("COVID_New_series_CostaRica", "Costa Rica",drawPoints = TRUE,
pointSize = 3,color=rgb(204/255,197/255,126/255)) %>%
dyRangeSelector()
fig <- plot_ly(COVID_updated, x = ~Confirmed, y = ~Deaths, z = ~Recovered, width=750) %>%
add_markers(text= ~Country.Region ,hoverinfo= "text",
marker = list(color=rgb(189/255,55/255,48/255))) %>%
layout(title="Confirmed cases Vs. Deaths Vs. Recovered", scene = list(
xaxis = list(title = 'Confirmed'),
yaxis = list(title = 'Deaths'),
zaxis = list(title = 'Recovered')))
fig
HDI<-read.csv("Human Development Index (HDI)_2.csv",sep=";",dec=",")
COVID_Country<-COVID_2 %>% filter(Date2==max(Date2)) %>%
group_by(Country.Region) %>% summarise(Total_confirmed=sum(Confirmed),
Total_deaths=sum(Deaths),
Total_Recovered=sum(Recovered))
Remove after parentheses:
HDI$Country_2<-gsub("\\s*\\([^\\)]+\\)","",as.character(HDI$Country))
HDI$Country_2[HDI$Country_2=="United States"]<-"US"
HDI$Country_2[HDI$Country_2=="Korea"]<-"South Korea"
Population:
Population<-read.csv("World_population.csv",sep=";",dec=",")
Remove after commma:
Population$Country_Name_2<-gsub(",.*", "", as.character(Population$Country_Name))
Population$Country_Name_2[Population$Country_Name_2=="United States"]<-"US"
Population$Country_Name_2[Population$Country_Code=="KOR"]<-"South Korea"
Population$Country_Name_2[Population$Country_Code=="CZE"]<-"Czechia"
Natural Join:
COVID_3<- COVID_Country %>% inner_join(HDI,by=c("Country.Region"="Country_2")) %>%
inner_join(Population,by=c("Country.Region"="Country_Name_2")) %>%
select(Country.Region,Total_confirmed,Total_deaths,Total_Recovered,HDI_Rank_2018,Year_2018,
Country_Code,Population_2018) %>%
mutate(Cases_million=(Total_confirmed/Population_2018)*1000000,
Recovered_percentage=(Total_Recovered/Total_confirmed)*100)
COVID_3<-COVID_3[!is.na(COVID_3$Population_2018),]
Plot the Human Development Index(HDI) Vs. the number of cases (applying a log transformation), and the proportion of recovered cases:
plot<-ggplot(data=COVID_3,aes(x=log(Cases_million),y=Year_2018,
size=Recovered_percentage,text=Country.Region)) +
geom_point(color="black",fill=rgb(237/255,105/255,37/255),shape=21,alpha=0.6) +
scale_size(range = c(3,15), name="Recovered \n percentage") +
theme_minimal() +
theme(legend.position="bottom") +
labs(title="HDI Vs. logarithmus of COVID-19 cases by million inhabitants \n and proportion of recovered",
x="ln(Cases/1M population)",
y="HDI")
ggplotly(plot,tooltip = c("text"),width=750)
COVID_numeric_1<-COVID_3 %>% mutate(Log_cases=log(Cases_million),
Death_percentage=(Total_deaths/Total_confirmed)*100) %>%
select(Log_cases,Recovered_percentage,Death_percentage,Year_2018)
corrplot(cor(COVID_numeric_1),method = "number",tl.col="black",tl.srt=15,
col=colorRampPalette(c(rgb(204/255,197/255,126/255),rgb(237/255,105/255,37/255)))(200))
Measles<-read.csv("Measles_immunization.csv",sep=";",dec=".")
Measles$Country_2<-gsub("\\s*\\([^\\)]+\\)","",as.character(Measles$Country))
Measles$Country_2[Measles$Country_2=="United States"]<-"US"
Measles$Country_2[Measles$Country_2=="Korea"]<-"South Korea"
COVID_3<- COVID_3 %>% inner_join(Measles,by=c("Country.Region"="Country_2")) %>% select(-c("Country"))
Health_expenditure<-read.csv("Health_expenditure_GDP.csv",sep=";",dec=".")
Health_expenditure$Country_2<-gsub("\\s*\\([^\\)]+\\)","",
as.character(Health_expenditure$Country))
Health_expenditure$Country_2[Health_expenditure$Country_2=="United States"]<-"US"
Health_expenditure$Country_2[Health_expenditure$Country_2=="Korea"]<-"South Korea"
COVID_3<- COVID_3 %>% inner_join(Health_expenditure,by=c("Country.Region"="Country_2")) %>% select(-c("Country"))
plot<-ggplot(data=COVID_3,aes(x=log(Cases_million),y=Expenditure_2016,
size=Recovered_percentage,text=Country.Region)) +
geom_point(color="black",fill=rgb(204/255,197/255,126/255),shape=21,alpha=0.6) +
scale_size(range = c(3,15), name="Recovered \n percentage") +
theme_minimal() +
theme(legend.position="bottom") +
labs(title="Health expenditure (% of GDP) Vs. logarithmus of COVID-19 cases by million inhabitants \n and proportion of recovered",
x="ln(Cases/1M population)",
y="% of GDP in health")
ggplotly(plot,tooltip = c("text"),width=750)
Temperature<-read.csv("GlobalLandTemperaturesByCountry.csv",sep=";")
Date_Temp<-as.Date(Temperature$dt, format="%d/%m/%Y")
Temperature$Date_Temp<-Date_Temp
Extract month, filter IQ (Jan, Feb and Mar) and obtain the average IQ temperature:
Temperature<- Temperature %>% mutate(Month=month(Temperature$Date_Temp,label =TRUE),
Year=year(Temperature$Date_Temp)) %>%
filter(Month %in% c("Jan","Feb","Mar") & Year>=2000) %>%
group_by(Country) %>% summarise(Avg_IQ_temperature=mean(AverageTemperature))
Temperature<-Temperature[!is.na(Temperature$Avg_IQ_temperature),]
Temperature$Country_2<-gsub("\\s*\\([^\\)]+\\)","",
as.character(Temperature$Country))
Temperature$Country_2[Temperature$Country_2=="United States"]<-"US"
Temperature$Country_2[Temperature$Country_2=="Korea"]<-"South Korea"
COVID_3<- COVID_3 %>% inner_join(Temperature,by=c("Country.Region"="Country_2")) %>% select(-c("Country"))
#write.csv(COVID_3,"COVID_Covariables.csv")
plot<-ggplot(data=COVID_3,aes(x=log(Cases_million),y=Avg_IQ_temperature,
size=Recovered_percentage,text=Country.Region)) +
geom_point(color="black",fill=rgb(120/255,28/255,109/255),shape=21,alpha=0.6) +
scale_size(range = c(3,15), name="Recovered \n percentage") +
theme_minimal() +
theme(legend.position="bottom") +
labs(title="Average IQ temperature Vs. logarithmus of COVID-19 cases by million inhabitants \n and proportion of recovered",
x="ln(Cases/1M population)",
y="Temperature (Celcius)")
ggplotly(plot,tooltip = c("text"),width=750)
###DTP immunization
DTP_immunization<-read.csv("Infants_lacking_immunization_DTP.csv",sep=";")
Remove after parentheses:
DTP_immunization$Country_2<-gsub("\\s*\\([^\\)]+\\)","",
as.character(DTP_immunization$Country))
DTP_immunization$Country_2[DTP_immunization$Country_2=="United States"]<-"US"
DTP_immunization$Country_2[DTP_immunization$Country_2=="Korea"]<-"South Korea"
COVID_4<- COVID_Country %>% inner_join(DTP_immunization,
by=c("Country.Region"="Country_2")) %>%
select(Country.Region,Total_confirmed,Total_deaths,Total_Recovered,
Lack_DTP_inmmunization_2018) %>%
mutate(Recovered_percentage=(Total_Recovered/Total_confirmed)*100,
Death_rate=(Total_deaths/Total_confirmed)*100)
plot<-ggplot(data=COVID_4,aes(x=Death_rate,y=Lack_DTP_inmmunization_2018,
size=Recovered_percentage,text=Country.Region)) +
geom_point(color="black",fill=rgb(120/255,28/255,109/255),shape=21,alpha=0.6) +
scale_size(range = c(3,15), name="Recovered \n percentage") +
theme_minimal() +
theme(legend.position="bottom") +
labs(title="% infants lacking DTP immunization Vs. death rate and \n proportion of recovered",
x="Novel coronavirus death rate",
y="% of infants")
ggplotly(plot,tooltip = c("text"),width=750)
COVID_4<- COVID_4 %>% inner_join(Measles,by=c("Country.Region"="Country_2")) %>% select(-c("Country"))
ggplot(COVID_4, aes(y=Measles_2018)) +
geom_boxplot(fill="dodgerblue4",outlier.shape = 21,
outlier.fill = "firebrick",alpha=0.75) +
ggtitle("Boxplot of % infants lacking measles immunization") + ylab("% of infants") +
theme_minimal()
ggplot(COVID_4, aes(Measles_2018)) +
geom_histogram(fill="dodgerblue4",bins=20,alpha=0.8) +
ggtitle("Histogram of % infants lacking measles immunization") +
xlab("% of infants") +
ylab("Count") +
theme_minimal()
plot<-ggplot(data=COVID_4,aes(x=Death_rate,y=Measles_2018,
size=Recovered_percentage,text=Country.Region)) +
geom_point(color="black",fill=rgb(237/255,105/255,37/255),shape=21,alpha=0.6) +
scale_size(range = c(3,15), name="Recovered \n percentage") +
theme_minimal() +
theme(legend.position="bottom") +
labs(title="% infants lacking measles immunization Vs. fatality rate \n and proportion of recovered",
x="Fatality rate (%)",
y="% of infants")
ggplotly(plot,tooltip = c("text"),width=750)
Transforming with ln and converting temperature as kelvin:
Mod1<-lm(log(COVID_3$Cases_million)~log(COVID_3$Year_2018)+log(COVID_3$Measles_2018)+
log(COVID_3$Expenditure_2016)+log(COVID_3$Avg_IQ_temperature+273.15))
summary(Mod1)
Call:
lm(formula = log(COVID_3$Cases_million) ~ log(COVID_3$Year_2018) +
log(COVID_3$Measles_2018) + log(COVID_3$Expenditure_2016) +
log(COVID_3$Avg_IQ_temperature + 273.15))
Residuals:
Min 1Q Median 3Q Max
-3.9505 -0.7802 -0.1227 0.8249 4.8559
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 29.73117 18.84323 1.578 0.11680
log(COVID_3$Year_2018) 7.89580 0.65925 11.977 < 2e-16 ***
log(COVID_3$Measles_2018) 0.05808 0.11861 0.490 0.62511
log(COVID_3$Expenditure_2016) 1.00664 0.31074 3.239 0.00149 **
log(COVID_3$Avg_IQ_temperature + 273.15) -4.39084 3.31671 -1.324 0.18765
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.37 on 144 degrees of freedom
Multiple R-squared: 0.7145, Adjusted R-squared: 0.7066
F-statistic: 90.1 on 4 and 144 DF, p-value: < 2.2e-16
Stepwise with AIC critertion:
Mod2<-step(Mod1,direction = "both")
Start: AIC=98.63
log(COVID_3$Cases_million) ~ log(COVID_3$Year_2018) + log(COVID_3$Measles_2018) +
log(COVID_3$Expenditure_2016) + log(COVID_3$Avg_IQ_temperature +
273.15)
Df Sum of Sq RSS AIC
- log(COVID_3$Measles_2018) 1 0.450 270.54 96.875
- log(COVID_3$Avg_IQ_temperature + 273.15) 1 3.287 273.38 98.430
<none> 270.09 98.627
- log(COVID_3$Expenditure_2016) 1 19.683 289.77 107.108
- log(COVID_3$Year_2018) 1 269.052 539.14 199.619
Step: AIC=96.87
log(COVID_3$Cases_million) ~ log(COVID_3$Year_2018) + log(COVID_3$Expenditure_2016) +
log(COVID_3$Avg_IQ_temperature + 273.15)
Df Sum of Sq RSS AIC
- log(COVID_3$Avg_IQ_temperature + 273.15) 1 3.19 273.74 96.624
<none> 270.54 96.875
+ log(COVID_3$Measles_2018) 1 0.45 270.09 98.627
- log(COVID_3$Expenditure_2016) 1 20.36 290.90 105.685
- log(COVID_3$Year_2018) 1 323.60 594.14 212.093
Step: AIC=96.62
log(COVID_3$Cases_million) ~ log(COVID_3$Year_2018) + log(COVID_3$Expenditure_2016)
Df Sum of Sq RSS AIC
<none> 273.74 96.624
+ log(COVID_3$Avg_IQ_temperature + 273.15) 1 3.19 270.54 96.875
+ log(COVID_3$Measles_2018) 1 0.36 273.38 98.430
- log(COVID_3$Expenditure_2016) 1 26.54 300.28 108.415
- log(COVID_3$Year_2018) 1 464.49 738.22 242.444
summary(Mod2)
Call:
lm(formula = log(COVID_3$Cases_million) ~ log(COVID_3$Year_2018) +
log(COVID_3$Expenditure_2016))
Residuals:
Min 1Q Median 3Q Max
-3.8665 -0.7435 0.0155 0.8653 4.8769
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.8355 0.6473 7.470 6.75e-12 ***
log(COVID_3$Year_2018) 8.1250 0.5162 15.740 < 2e-16 ***
log(COVID_3$Expenditure_2016) 1.1247 0.2989 3.763 0.000243 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.369 on 146 degrees of freedom
Multiple R-squared: 0.7107, Adjusted R-squared: 0.7067
F-statistic: 179.3 on 2 and 146 DF, p-value: < 2.2e-16
Normality of residuals:
hist(Mod2$residuals)
shapiro.test(Mod2$residuals)
Shapiro-Wilk normality test
data: Mod2$residuals
W = 0.98443, p-value = 0.09118
Prediction power: separate between training and testing:
set.seed(179819)
Sample <- sample(1:length(COVID_3$Cases_million),length(COVID_3$Cases_million)*0.20)
t.testing<- COVID_3[Sample,]
t.training<- COVID_3[-Sample,]
Transform the training and testing variables as before:
t.training<-t.training %>% mutate(Cases_million_log=log(Cases_million),HDI_log=log(Year_2018),
GDP_log=log(Expenditure_2016),
Temperature_log_kelvin=log(Avg_IQ_temperature+273.15))
t.training<-t.training[,14:17]
t.testing<-t.testing %>% mutate(Cases_million_log=log(Cases_million),HDI_log=log(Year_2018),
GDP_log=log(Expenditure_2016),
Temperature_log_kelvin=log(Avg_IQ_temperature+273.15))
t.testing<-t.testing[,14:17]
Fit the same model with training
Mod3<-lm(Cases_million_log~., data=t.training)
summary(Mod3)
Call:
lm(formula = Cases_million_log ~ ., data = t.training)
Residuals:
Min 1Q Median 3Q Max
-4.2047 -0.8363 -0.0585 0.7654 4.6216
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 38.2180 20.2009 1.892 0.06100 .
HDI_log 7.1656 0.6494 11.034 < 2e-16 ***
GDP_log 1.0266 0.3489 2.943 0.00393 **
Temperature_log_kelvin -5.9054 3.5546 -1.661 0.09934 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.351 on 116 degrees of freedom
Multiple R-squared: 0.7089, Adjusted R-squared: 0.7014
F-statistic: 94.16 on 3 and 116 DF, p-value: < 2.2e-16
Error functions:
# Residual Sum of Square (RSS)
RSS<-function(Pred,Actual) {
ss<-sum((Actual-Pred)^2)
return(ss)
}
# Residual Standard Error (RSE)
RSE<-function(Pred,Real,NumPred) {
N<-length(Real)-NumPred-1
ss<-sqrt((1/N)*RSS(Pred,Real))
return(ss)
}
# Mean Squared Error
MSE <- function(Pred,Actual) {
N<-length(Actual)
ss<-(1/N)*RSS(Pred,Actual)
return(ss)
}
# Relative error
RelativeError<-function(Pred,Actual) {
ss<-sum(abs(Actual-Pred))/sum(abs(Actual))
return(ss)
}
Prediction:
Pred<-predict(Mod3,t.testing)
Errors:
RSS_Mod3<-RSS(Pred,t.testing$Cases_million_log)
RSE_Mod3<-RSE(Pred,t.testing$Cases_million_log,dim(t.testing)[2]-1)
MSE_Mod3<-MSE(Pred,t.testing$Cases_million_log)
RelativeError_Mod3<-RelativeError(Pred,t.testing$Cases_million_log)
Mod3_Errors<-c(RSS_Mod3,RSE_Mod3,MSE_Mod3,RelativeError_Mod3)
Now, a model without temperature:
t.training <- t.training %>% select(-Temperature_log_kelvin)
t.testing <- t.testing %>% select(-Temperature_log_kelvin)
Mod4<-lm(Cases_million_log~., data=t.training)
summary(Mod4)
Call:
lm(formula = Cases_million_log ~ ., data = t.training)
Residuals:
Min 1Q Median 3Q Max
-4.1277 -0.8134 0.0036 0.7896 4.6676
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.6788 0.7362 6.356 4.14e-09 ***
HDI_log 7.6919 0.5711 13.468 < 2e-16 ***
GDP_log 1.1737 0.3400 3.452 0.000775 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.361 on 117 degrees of freedom
Multiple R-squared: 0.702, Adjusted R-squared: 0.6969
F-statistic: 137.8 on 2 and 117 DF, p-value: < 2.2e-16
Prediction:
Pred<-predict(Mod4,t.testing)
Errors:
RSS_Mod4<-RSS(Pred,t.testing$Cases_million_log)
RSE_Mod4<-RSE(Pred,t.testing$Cases_million_log,dim(t.testing)[2]-1)
MSE_Mod4<-MSE(Pred,t.testing$Cases_million_log)
RelativeError_Mod4<-RelativeError(Pred,t.testing$Cases_million_log)
Mod4_Errors<-c(RSS_Mod4,RSE_Mod4,MSE_Mod4,RelativeError_Mod4)
Create a radarplot:
Errors<-rbind(Mod3_Errors,Mod4_Errors)
rownames(Errors)<-c("Model with temperature","Model without temperature")
colnames(Errors)<-c("Residual Sum of Square","Residual Standard Error","Mean Squared Error","Relative error")
Errors<-as.data.frame(Errors)
maximum<-apply(Errors,2,max)
minimum<-apply(Errors,2,min)
Errors<-rbind(minimum,Errors)
Errors<-rbind(maximum,Errors)
radarchart(Errors,maxmin=TRUE,axistype=4,axislabcol="slategray4",
centerzero=FALSE,seg=8,cglcol="gray67",
pcol=c("dodgerblue2","firebrick2","darkorange2","darkorchid2"),
plty=1,
plwd=3,
title="Error comparison")
legend <-legend(1.5,1, legend=c("With temperature","Without temperature"),
seg.len=-1.4,
title="Errors",
pch=21,
bty="n" ,lwd=3, y.intersp=1, horiz=FALSE,
col=c("dodgerblue2","firebrick2","darkorange2","darkorchid2"))
Model with temperature
CostaRica_Temp<-read.csv("CostaRica_Temperature.csv",sep=";")
CostaRica_Temp$Date<-as.Date(CostaRica_Temp$Date,format="%d/%m/%Y")
COVID_2_Day_CostaRica_Temp<-COVID_2_Day_CostaRica %>%
inner_join(CostaRica_Temp,by=c("Date2"="Date"))
COVID_Day_series_CostaRica_Temp<-xts(COVID_2_Day_CostaRica_Temp$World_confirmed, order.by=COVID_2_Day_CostaRica_Temp$Date2)
COVID_series_CostaRica_Temp_Train<-COVID_Day_series_CostaRica_Temp[1:64] #Until March 25th
COVID_series_CostaRica_Temp_Test<-COVID_Day_series_CostaRica_Temp[65:length(COVID_Day_series_CostaRica_Temp)] #From March 26th onwards
Get the lagged difference:
plot(diff(COVID_series_CostaRica_Temp_Train),type="l",main="1st ")
Correlograms of first diference:
tsdisplay(diff(COVID_series_CostaRica_Temp_Train))
Arima model: ARIMA(1,2,1)
#Auto Arima for Costa Rica:
ARIMA1_CostaRica<-auto.arima(COVID_series_CostaRica_Temp_Train,
xreg = COVID_2_Day_CostaRica_Temp$Temperature_CostaRica[1:64])
summary(ARIMA1_CostaRica)
Series: COVID_series_CostaRica_Temp_Train
Regression with ARIMA(1,2,1) errors
Coefficients:
ar1 ma1 xreg
-0.7967 0.4328 -0.2134
s.e. 0.1462 0.1840 0.1159
sigma^2 estimated as 7.227: log likelihood=-147.93
AIC=303.86 AICc=304.57 BIC=312.37
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.4281342 2.581091 1.397242 NaN Inf 0.4379416 -0.009839234
Predictive error functions:
#Relative error
RE <- function(Fore,Actual) {
return(sum(abs(Fore-Actual))/abs(sum(Actual)))
}
#MAPE
MAPE<-function(Fore,Actual){
return(
mean(abs(Actual-Fore)/abs(Actual))*100
)
}
# mean squared error (MSE)
MSE<-function(Fore,Actual) {
N<-length(Actual)
ss<-sum((Actual-Fore)^2)
return((1/N)*ss)
}
#PFA
PFA <- function(Fore,Actual) {
Total<-0
N<-length(Fore)
for(i in 1:N) {
if(Fore[i]>=Actual[i])
Total<-Total+1
}
return(Total/N)
}
Forecast prediction to compare
Test_CostaRica_Temp<-COVID_2_Day_CostaRica_Temp$Temperature_CostaRica[65:length(COVID_Day_series_CostaRica_Temp)]
P_CRI_1<-forecast(ARIMA1_CostaRica,xreg = Test_CostaRica_Temp)
Calculate errors:
RE_CRI_1<-RE(P_CRI_1$mean,as.vector(COVID_series_CostaRica_Temp_Test))
MAPE_CRI_1<-MAPE(P_CRI_1$mean,as.vector(COVID_series_CostaRica_Temp_Test))
MSE_CRI_1<-MSE(P_CRI_1$mean,as.vector(COVID_series_CostaRica_Temp_Test))
PFA_CRI_1<-PFA(P_CRI_1$mean,as.vector(COVID_series_CostaRica_Temp_Test))
Errors.CRI_1<-c(RE_CRI_1,MAPE_CRI_1,MSE_CRI_1,PFA_CRI_1)
Errors.CRI_1
[1] 0.02864855 3.31029459 186.64088433 0.38888889
Manual MAPE:
dd<-data.frame(Actual=as.vector(COVID_series_CostaRica_Temp_Test),
Fore=P_CRI_1$mean
)
dd<-dd %>% mutate(Abs_Per_Error=abs(Actual-Fore)/abs(Actual))
mean(dd$Abs_Per_Error)*100
[1] 3.310295
Model without temperature
ARIMA(1,2,0)
#Auto Arima for Costa Rica:
ARIMA2_CostaRica<-auto.arima(COVID_series_CostaRica_Temp_Train)
summary(ARIMA2_CostaRica)
Series: COVID_series_CostaRica_Temp_Train
ARIMA(1,2,0)
Coefficients:
ar1
-0.4701
s.e. 0.1133
sigma^2 estimated as 7.607: log likelihood=-150.5
AIC=304.99 AICc=305.19 BIC=309.24
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.5145597 2.692615 1.200452 7.86228 21.76863 0.3762612 0.04160954
Forecast prediction to compare
P_CRI_2<-forecast(ARIMA2_CostaRica,h=length(COVID_series_CostaRica_Temp_Test))
Calculate errors:
RE_CRI_2<-RE(P_CRI_2$mean,as.vector(COVID_series_CostaRica_Temp_Test))
MAPE_CRI_2<-MAPE(P_CRI_2$mean,as.vector(COVID_series_CostaRica_Temp_Test))
MSE_CRI_2<-MSE(P_CRI_2$mean,as.vector(COVID_series_CostaRica_Temp_Test))
PFA_CRI_2<-PFA(P_CRI_2$mean,as.vector(COVID_series_CostaRica_Temp_Test))
Errors.CRI_2<-c(RE_CRI_2,MAPE_CRI_2,MSE_CRI_2,PFA_CRI_2)
Errors.CRI_2
[1] 0.02795849 3.31555299 189.64942824 0.33333333
Errors<-rbind(Errors.CRI_1,Errors.CRI_2)
rownames(Errors)<-c("ARIMAX(1,2,1)","ARIMA(1,2,0)")
colnames(Errors)<-c("Relative error","Mean Abs % error","Mean Squared Error","PFA")
Errors<-as.data.frame(Errors)
maximum<-apply(Errors,2,max)
minimum<-apply(Errors,2,min)
Errors<-rbind(minimum,Errors)
Errors<-rbind(maximum,Errors)
radarchart(Errors,maxmin=TRUE,axistype=4,axislabcol="slategray4",
centerzero=FALSE,seg=8,cglcol="gray67",
pcol=c("dodgerblue2","firebrick2","darkorange2","darkorchid2"),
plty=1,
plwd=3,
title="Error comparison")
legend <-legend(1.5,1, legend=c("ARIMAX(1,2,1)","ARIMA(1,2,0)"),
seg.len=-1.4,
title="Errors",
pch=21,
bty="n" ,lwd=3, y.intersp=1, horiz=FALSE,
col=c("dodgerblue2","firebrick2","darkorange2","darkorchid2"))
Conclusion: Keep model with temperature
Make model with the overall series
#Auto Arima for Costa Rica:
ARIMA_Final_CostaRica<-auto.arima(COVID_Day_series_CostaRica_Temp,
xreg = COVID_2_Day_CostaRica_Temp$Temperature_CostaRica)
summary(ARIMA_Final_CostaRica)
Series: COVID_Day_series_CostaRica_Temp
Regression with ARIMA(1,1,1) errors
Coefficients:
ar1 ma1 xreg
0.9790 -0.3878 -0.1779
s.e. 0.0213 0.1369 0.1813
sigma^2 estimated as 18.82: log likelihood=-233.45
AIC=474.91 AICc=475.44 BIC=484.49
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.6041916 4.230615 2.352548 NaN Inf 0.3202629 0.03640188
Final forecast:
Future_CostaRica_Temp<-CostaRica_Temp$Temperature_CostaRica[CostaRica_Temp$Date>max(COVID_2$Date2)]
P_CRI_Final<-forecast(ARIMA_Final_CostaRica,xreg = Future_CostaRica_Temp)
Low_lim_CRI<-data.frame(P_CRI_Final$lower)[,2]
Upp_lim_CRI<-data.frame(P_CRI_Final$upper)[,2]
For making the plot:
##Data periods
per_1 <- as.Date(as.character(COVID_2_Day_CostaRica_Temp$Date2))
per_2 <- seq(as.Date(max(COVID_2_Day_CostaRica_Temp$Date2)+1,format="%Y-%m-%d"), as.Date("2020-04-30",format="%Y-%m-%d"),"1 day")
# Merge forecast + actuals
data <- xts(COVID_Day_series_CostaRica_Temp,order.by=per_1)
dataNA <- rep(NA, length(data))
A <- cbind(data,dataNA,dataNA,dataNA)
Low_lim_CRI <- xts(Low_lim_CRI,order.by=per_2)
Forecast_CRI <- xts(P_CRI_Final$mean,order.by=per_2)
Upp_lim_CRI <- xts(Upp_lim_CRI,order.by=per_2)
predNA <- rep(NA, length(Forecast_CRI))
B <- cbind(predNA, Low_lim_CRI, Forecast_CRI, Upp_lim_CRI)
all_series_CRI <- data.frame(rbind(as.matrix(A),as.matrix(B)))
colnames(all_series_CRI) <- c('Actual', 'Lower_limit', 'Forecast', 'Upper_limit')
dygraph(all_series_CRI, main="SARS-COV2-outbreak: Total Costa Rica cases",xlab="Date", ylab="Novel coronavirus cases",width = 750)%>%
dySeries(c('Lower_limit', 'Forecast', 'Upper_limit'),label="Forecast",strokeWidth=2,
drawPoints = TRUE, pointSize = 2, color=rgb(189/255,44/255,47/255)) %>%
dySeries("Actual",drawPoints = TRUE, strokeWidth=2, pointSize = 2,
color=rgb(10/255,44/255,119/255)) %>%
dyRangeSelector()
NA
Model with temperature
Mexico_Temp<-read.csv("Mexico_Temperature.csv",sep=";")
Mexico_Temp$Date<-as.Date(Mexico_Temp$Date,format="%d/%m/%Y")
COVID_2_Day_Mexico<- COVID_2 %>%
filter(Country.Region %in% c("Mexico")) %>%
group_by(Date2) %>% summarise(World_confirmed=sum(Confirmed))
COVID_2_Day_Mexico_Temp<-COVID_2_Day_Mexico %>%
inner_join(Mexico_Temp,by=c("Date2"="Date"))
COVID_Day_series_Mexico_Temp<-xts(COVID_2_Day_Mexico_Temp$World_confirmed, order.by=COVID_2_Day_Mexico_Temp$Date2)
COVID_series_Mexico_Train<-COVID_Day_series_Mexico_Temp[1:74] #Until April 4th
COVID_series_Mexico_Test<-COVID_Day_series_Mexico_Temp[75:length(COVID_Day_series_Mexico_Temp)] #From April 5th onwards
Get the lagged difference:
plot(diff(COVID_series_Mexico_Train),type="l",main="1st ")
Correlograms of first diference:
tsdisplay(diff(COVID_series_Mexico_Train))
Arima model: ARIMA(2,2,3)
#Auto Arima for Mexico:
#ARIMA1_Mexico<-auto.arima(COVID_series_Mexico_Train,
#xreg = COVID_2_Day_Mexico_Temp$Temperature_Mexico[1:74])
ARIMA1_Mexico<-Arima(COVID_series_Mexico_Train,order=c(2,2,3),xreg=COVID_2_Day_Mexico_Temp$Temperature_Mexico[1:74])
summary(ARIMA1_Mexico)
Series: COVID_series_Mexico_Train
Regression with ARIMA(2,2,3) errors
Coefficients:
ar1 ar2 ma1 ma2 ma3 xreg
-0.9027 -0.7333 1.1212 0.8938 0.6734 0.0623
s.e. 0.1121 0.1592 0.1093 0.2027 0.1153 0.2567
sigma^2 estimated as 123.3: log likelihood=-273.92
AIC=561.84 AICc=563.59 BIC=577.78
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 1.716467 10.48796 5.177678 NaN Inf 0.2239162 -0.1047828
Forecast prediction to compare
Test_Mexico_Temp<-COVID_2_Day_Mexico_Temp$Temperature_Mexico[75:length(COVID_Day_series_Mexico_Temp)]
P_MEX_1<-forecast(ARIMA1_Mexico,xreg = Test_Mexico_Temp)
Calculate errors:
RE_MEX_1<-RE(P_MEX_1$mean,as.vector(COVID_series_Mexico_Test))
MAPE_MEX_1<-MAPE(P_MEX_1$mean,as.vector(COVID_series_Mexico_Test))
MSE_MEX_1<-MSE(P_MEX_1$mean,as.vector(COVID_series_Mexico_Test))
PFA_MEX_1<-PFA(P_MEX_1$mean,as.vector(COVID_series_Mexico_Test))
Errors.MEX_1<-c(RE_MEX_1,MAPE_MEX_1,MSE_MEX_1,PFA_MEX_1)
Errors.MEX_1
[1] 1.685421e-01 1.468591e+01 3.878720e+05 0.000000e+00
Manual MAPE:
dd<-data.frame(Actual=as.vector(COVID_series_Mexico_Test),
Fore=P_MEX_1$mean
)
dd<-dd %>% mutate(Abs_Per_Error=abs(Actual-Fore)/abs(Actual))
mean(dd$Abs_Per_Error)*100
[1] 14.68591
Model without temperature
ARIMA(2,2,4)
#Auto Arima for Costa Rica:
ARIMA2_Mexico<-auto.arima(COVID_series_Mexico_Train)
summary(ARIMA2_Mexico)
Series: COVID_series_Mexico_Train
ARIMA(2,2,4)
Coefficients:
ar1 ar2 ma1 ma2 ma3 ma4
-0.7232 -0.7137 0.8661 0.6669 0.4019 -0.3260
s.e. 0.1415 0.1688 0.1697 0.2135 0.1529 0.1463
sigma^2 estimated as 116.5: log likelihood=-272.23
AIC=558.45 AICc=560.2 BIC=574.39
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 2.187346 10.19436 4.88144 4.696633 18.33364 0.2111049 -0.05173888
Forecast prediction to compare
P_MEX_2<-forecast(ARIMA2_Mexico,h=length(COVID_series_Mexico_Test))
Calculate errors:
RE_MEX_2<-RE(P_MEX_2$mean,as.vector(COVID_series_Mexico_Test))
MAPE_MEX_2<-MAPE(P_MEX_2$mean,as.vector(COVID_series_Mexico_Test))
MSE_MEX_2<-MSE(P_MEX_2$mean,as.vector(COVID_series_Mexico_Test))
PFA_MEX_2<-PFA(P_MEX_2$mean,as.vector(COVID_series_Mexico_Test))
Errors.MEX_2<-c(RE_MEX_2,MAPE_MEX_2,MSE_MEX_2,PFA_MEX_2)
Errors.MEX_2
[1] 1.728869e-01 1.505426e+01 4.086921e+05 0.000000e+00
Errors<-rbind(Errors.MEX_1,Errors.MEX_2)
rownames(Errors)<-c("ARIMAX(2,2,3)","ARIMA(2,2,4)")
colnames(Errors)<-c("Relative error","Mean Abs % error","Mean Squared Error","PFA")
Errors<-as.data.frame(Errors)
maximum<-apply(Errors,2,max)
minimum<-apply(Errors,2,min)
Errors<-rbind(minimum,Errors)
Errors<-rbind(maximum,Errors)
radarchart(Errors,maxmin=TRUE,axistype=4,axislabcol="slategray4",
centerzero=FALSE,seg=8,cglcol="gray67",
pcol=c("dodgerblue2","firebrick2","darkorange2","darkorchid2"),
plty=1,
plwd=3,
title="Error comparison")
legend <-legend(1.5,1, legend=c("ARIMAX(2,2,3)","ARIMA(2,2,4)"),
seg.len=-1.4,
title="Errors",
pch=21,
bty="n" ,lwd=3, y.intersp=1, horiz=FALSE,
col=c("dodgerblue2","firebrick2","darkorange2","darkorchid2"))
Conclusion: Keep model with temperature
Make model with the overall series
#Auto Arima for Costa Rica:
ARIMA_Final_Mexico<-auto.arima(COVID_Day_series_Mexico_Temp,
xreg = COVID_2_Day_Mexico_Temp$Temperature_Mexico)
summary(ARIMA_Final_Mexico)
Series: COVID_Day_series_Mexico_Temp
Regression with ARIMA(2,2,1) errors
Coefficients:
ar1 ar2 ma1 xreg
0.4741 0.5051 -0.8911 -0.3172
s.e. 0.1029 0.1002 0.0545 1.0035
sigma^2 estimated as 618.6: log likelihood=-369.16
AIC=748.32 AICc=749.13 BIC=760.23
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 3.617958 23.94469 10.92492 NaN Inf 0.209746 -0.08080313
Final forecast:
Future_Mexico_Temp<-Mexico_Temp$Temperature_Mexico[Mexico_Temp$Date>max(COVID_2$Date2)]
P_MEX_Final<-forecast(ARIMA_Final_Mexico,xreg = Future_Mexico_Temp)
Low_lim_MEX<-data.frame(P_MEX_Final$lower)[,2]
Upp_lim_MEX<-data.frame(P_MEX_Final$upper)[,2]
For making the plot:
##Data periods
per_1 <- as.Date(as.character(COVID_2_Day_Mexico_Temp$Date2))
per_2 <- seq(as.Date(max(COVID_2_Day_Mexico_Temp$Date2)+1,format="%Y-%m-%d"), as.Date("2020-04-30",format="%Y-%m-%d"),"1 day")
# Merge forecast + actuals
data <- xts(COVID_Day_series_Mexico_Temp,order.by=per_1)
dataNA <- rep(NA, length(data))
A <- cbind(data,dataNA,dataNA,dataNA)
Low_lim_MEX <- xts(Low_lim_MEX,order.by=per_2)
Forecast_MEX <- xts(P_MEX_Final$mean,order.by=per_2)
Upp_lim_MEX <- xts(Upp_lim_MEX,order.by=per_2)
predNA <- rep(NA, length(Forecast_MEX))
B <- cbind(predNA, Low_lim_MEX, Forecast_MEX, Upp_lim_MEX)
all_series_MEX <- data.frame(rbind(as.matrix(A),as.matrix(B)))
colnames(all_series_MEX) <- c('Actual', 'Lower_limit', 'Forecast', 'Upper_limit')
dygraph(all_series_MEX, main="SARS-COV2-outbreak: Total Mexico cases",xlab="Date", ylab="Novel coronavirus cases",width = 750)%>%
dySeries(c('Lower_limit', 'Forecast', 'Upper_limit'),label="Forecast",strokeWidth=2,
drawPoints = TRUE, pointSize = 2, color=rgb(189/255,44/255,47/255)) %>%
dySeries("Actual",drawPoints = TRUE, strokeWidth=2, pointSize = 2,
color=rgb(43/255,102/255,73/255)) %>%
dyRangeSelector()
NA
Model with temperature
Italy_Temp<-read.csv("Italy_Temperature.csv",sep=";")
Italy_Temp$Date<-as.Date(Italy_Temp$Date,format="%d/%m/%Y")
COVID_2_Day_Italy<- COVID_2 %>%
filter(Country.Region %in% c("Italy")) %>%
group_by(Date2) %>% summarise(World_confirmed=sum(Confirmed))
COVID_2_Day_Italy_Temp<-COVID_2_Day_Italy %>%
inner_join(Italy_Temp,by=c("Date2"="Date"))
COVID_Day_series_Italy_Temp<-xts(COVID_2_Day_Italy_Temp$World_confirmed, order.by=COVID_2_Day_Italy_Temp$Date2)
COVID_Day_series_Italy_Train<-COVID_Day_series_Italy_Temp[1:64] #Until March 25th
COVID_Day_series_Italy_Test<-COVID_Day_series_Italy_Temp[65:length(COVID_Day_series_Italy_Temp)] #From March 26th onwards
Get the lagged difference:
plot(diff(COVID_Day_series_Italy_Train),type="l",main="1st ")
Correlograms of first diference:
tsdisplay(diff(COVID_Day_series_Italy_Train))
Arima model: ARIMA(1,2,0)
#Auto Arima for Italy:
ARIMA1_Italy<-auto.arima(COVID_Day_series_Italy_Train,
xreg = COVID_2_Day_Italy_Temp$Temperature_Italy[1:64])
#ARIMA1_Mexico<-Arima(COVID_series_Mexico_Train,order=c(2,2,3),xreg=COVID_2_Day_Mexico_Temp$Temperature_Mexico[1:74])
summary(ARIMA1_Italy)
Series: COVID_Day_series_Italy_Train
Regression with ARIMA(1,2,0) errors
Coefficients:
ar1 xreg
-0.5094 21.4065
s.e. 0.1119 20.2612
sigma^2 estimated as 489396: log likelihood=-493.24
AIC=992.47 AICc=992.89 BIC=998.85
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 123.6808 677.3545 335.2238 NaN Inf 0.2839123 -0.005148955
Forecast prediction to compare
Test_Italy_Temp<-COVID_2_Day_Italy_Temp$Temperature_Italy[65:length(COVID_Day_series_Italy_Temp)]
P_ITA_1<-forecast(ARIMA1_Italy,xreg = Test_Italy_Temp)
Calculate errors:
RE_ITA_1<-RE(P_ITA_1$mean,as.vector(COVID_Day_series_Italy_Test))
MAPE_ITA_1<-MAPE(P_ITA_1$mean,as.vector(COVID_Day_series_Italy_Test))
MSE_ITA_1<-MSE(P_ITA_1$mean,as.vector(COVID_Day_series_Italy_Test))
PFA_ITA_1<-PFA(P_ITA_1$mean,as.vector(COVID_Day_series_Italy_Test))
Errors.ITA_1<-c(RE_ITA_1,MAPE_ITA_1,MSE_ITA_1,PFA_ITA_1)
Errors.ITA_1
[1] 3.837737e-02 3.421402e+00 3.904275e+07 7.222222e-01
Manual MAPE:
dd<-data.frame(Actual=as.vector(COVID_Day_series_Italy_Test),
Fore=P_ITA_1$mean
)
dd<-dd %>% mutate(Abs_Per_Error=abs(Actual-Fore)/abs(Actual))
mean(dd$Abs_Per_Error)*100
[1] 3.421402
Model without temperature
ARIMA(1,2,0)
#Auto Arima for Italy:
ARIMA2_Italy<-auto.arima(COVID_Day_series_Italy_Train)
summary(ARIMA2_Italy)
Series: COVID_Day_series_Italy_Train
ARIMA(1,2,0)
Coefficients:
ar1
-0.5414
s.e. 0.1046
sigma^2 estimated as 489700: log likelihood=-493.79
AIC=991.58 AICc=991.79 BIC=995.84
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 125.8084 683.1877 295.442 5.335548 11.24576 0.2502197 -0.02348722
Forecast prediction to compare
P_ITA_2<-forecast(ARIMA2_Italy,h=length(COVID_Day_series_Italy_Test))
Calculate errors:
RE_ITA_2<-RE(P_ITA_2$mean,as.vector(COVID_Day_series_Italy_Test))
MAPE_ITA_2<-MAPE(P_ITA_2$mean,as.vector(COVID_Day_series_Italy_Test))
MSE_ITA_2<-MSE(P_ITA_2$mean,as.vector(COVID_Day_series_Italy_Test))
PFA_ITA_2<-PFA(P_ITA_2$mean,as.vector(COVID_Day_series_Italy_Test))
Errors.ITA_2<-c(RE_ITA_2,MAPE_ITA_2,MSE_ITA_2,PFA_ITA_2)
Errors.ITA_2
[1] 3.608941e-02 3.235491e+00 3.442908e+07 6.666667e-01
Errors<-rbind(Errors.ITA_1,Errors.ITA_2)
rownames(Errors)<-c("ARIMAX(1,2,0)","ARIMA(1,2,0)")
colnames(Errors)<-c("Relative error","Mean Abs % error","Mean Squared Error","PFA")
Errors<-as.data.frame(Errors)
maximum<-apply(Errors,2,max)
minimum<-apply(Errors,2,min)
Errors<-rbind(minimum,Errors)
Errors<-rbind(maximum,Errors)
radarchart(Errors,maxmin=TRUE,axistype=4,axislabcol="slategray4",
centerzero=FALSE,seg=8,cglcol="gray67",
pcol=c("dodgerblue2","firebrick2","darkorange2","darkorchid2"),
plty=1,
plwd=3,
title="Error comparison")
legend <-legend(1.5,1, legend=c("ARIMAX(1,2,0)","ARIMA(1,2,0)"),
seg.len=-1.4,
title="Errors",
pch=21,
bty="n" ,lwd=3, y.intersp=1, horiz=FALSE,
col=c("dodgerblue2","firebrick2","darkorange2","darkorchid2"))
Conclusion: Keep model without temperature
Make model with the overall series
#Auto Arima for Italy:
ARIMA_Final_Italy<-auto.arima(COVID_Day_series_Italy_Temp)
summary(ARIMA_Final_Italy)
Series: COVID_Day_series_Italy_Temp
ARIMA(1,2,0)
Coefficients:
ar1
-0.4727
s.e. 0.0977
sigma^2 estimated as 481686: log likelihood=-636.54
AIC=1277.08 AICc=1277.24 BIC=1281.84
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 76.96003 681.2217 345.1584 3.887578 8.51579 0.1788008 0.01037227
Final forecast:
Future_Italy_Temp<-Italy_Temp$Temperature_Italy[Italy_Temp$Date>max(COVID_2$Date2)]
P_ITA_Final<-forecast(ARIMA_Final_Italy,h=length(Future_Italy_Temp))
Low_lim_ITA<-data.frame(P_ITA_Final$lower)[,2]
Upp_lim_ITA<-data.frame(P_ITA_Final$upper)[,2]
For making the plot:
##Data periods
per_1 <- as.Date(as.character(COVID_2_Day_Italy_Temp$Date2))
per_2 <- seq(as.Date(max(COVID_2_Day_Italy_Temp$Date2)+1,format="%Y-%m-%d"), as.Date("2020-04-30",format="%Y-%m-%d"),"1 day")
# Merge forecast + actuals
data <- xts(COVID_Day_series_Italy_Temp,order.by=per_1)
dataNA <- rep(NA, length(data))
A <- cbind(data,dataNA,dataNA,dataNA)
Low_lim_ITA <- xts(Low_lim_ITA,order.by=per_2)
Forecast_ITA <- xts(P_ITA_Final$mean,order.by=per_2)
Upp_lim_ITA <- xts(Upp_lim_ITA,order.by=per_2)
predNA <- rep(NA, length(Forecast_ITA))
B <- cbind(predNA, Low_lim_ITA, Forecast_ITA, Upp_lim_ITA)
all_series_ITA <- data.frame(rbind(as.matrix(A),as.matrix(B)))
colnames(all_series_ITA) <- c('Actual', 'Lower_limit', 'Forecast', 'Upper_limit')
dygraph(all_series_ITA, main="SARS-COV2-outbreak: Total Italy cases",xlab="Date", ylab="Novel coronavirus cases",width = 750)%>%
dySeries(c('Lower_limit', 'Forecast', 'Upper_limit'),label="Forecast",strokeWidth=2,
drawPoints = TRUE, pointSize = 2, color=rgb(190/255,59/255,61/255)) %>%
dySeries("Actual",drawPoints = TRUE, strokeWidth=2, pointSize = 2,
color=rgb(64/255,143/255,78/255)) %>%
dyRangeSelector()
NA
Model with temperature
Lebanon_Temp<-read.csv("Lebanon_Temperature.csv",sep=";")
Lebanon_Temp$Date<-as.Date(Lebanon_Temp$Date,format="%d/%m/%Y")
COVID_2_Day_Lebanon<- COVID_2 %>%
filter(Country.Region %in% c("Lebanon")) %>%
group_by(Date2) %>% summarise(World_confirmed=sum(Confirmed))
COVID_2_Day_Lebanon_Temp<-COVID_2_Day_Lebanon %>%
inner_join(Lebanon_Temp,by=c("Date2"="Date"))
COVID_Day_series_Lebanon_Temp<-xts(COVID_2_Day_Lebanon_Temp$World_confirmed, order.by=COVID_2_Day_Lebanon_Temp$Date2)
COVID_series_Lebanon_Train<-COVID_Day_series_Lebanon_Temp[1:74] #Until April 4th
COVID_series_Lebanon_Test<-COVID_Day_series_Lebanon_Temp[75:length(COVID_Day_series_Lebanon_Temp)] #From April 4th onwards
Get the lagged difference:
plot(diff(COVID_series_Lebanon_Train),type="l",main="1st ")
Correlograms of first diference:
tsdisplay(diff(COVID_series_Lebanon_Train))
Arima model: ARIMA(1,1,2)
#Auto Arima for Italy:
ARIMA1_Lebanon<-auto.arima(COVID_series_Lebanon_Train,
xreg = COVID_2_Day_Lebanon_Temp$Temperature_Lebanon[1:74])
#ARIMA1_Mexico<-Arima(COVID_series_Mexico_Train,order=c(2,2,3),xreg=COVID_2_Day_Mexico_Temp$Temperature_Mexico[1:74])
summary(ARIMA1_Lebanon)
Series: COVID_series_Lebanon_Train
Regression with ARIMA(1,1,2) errors
Coefficients:
ar1 ma1 ma2 xreg
0.9216 -0.8504 0.4970 0.0461
s.e. 0.0488 0.0989 0.1263 0.2338
sigma^2 estimated as 60.69: log likelihood=-252.36
AIC=514.72 AICc=515.62 BIC=526.17
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 1.101941 7.522299 3.444682 NaN Inf 0.4835803 -0.03853757
Forecast prediction to compare
Test_Lebanon_Temp<-COVID_2_Day_Lebanon_Temp$Temperature_Lebanon[75:length(COVID_Day_series_Lebanon_Temp)]
P_LBN_1<-forecast(ARIMA1_Lebanon,xreg = Test_Lebanon_Temp)
Calculate errors:
RE_LBN_1<-RE(P_LBN_1$mean,as.vector(COVID_series_Lebanon_Test))
MAPE_LBN_1<-MAPE(P_LBN_1$mean,as.vector(COVID_series_Lebanon_Test))
MSE_LBN_1<-MSE(P_LBN_1$mean,as.vector(COVID_series_Lebanon_Test))
PFA_LBN_1<-PFA(P_LBN_1$mean,as.vector(COVID_series_Lebanon_Test))
Errors.LBN_1<-c(RE_LBN_1,MAPE_LBN_1,MSE_LBN_1,PFA_LBN_1)
Errors.LBN_1
[1] 0.02939284 2.81563809 465.27621325 0.37500000
Manual MAPE:
dd<-data.frame(Actual=as.vector(COVID_series_Lebanon_Test),
Fore=P_LBN_1$mean
)
dd<-dd %>% mutate(Abs_Per_Error=abs(Actual-Fore)/abs(Actual))
mean(dd$Abs_Per_Error)*100
[1] 2.815638
Model without temperature
ARIMA(0,2,2)
#Auto Arima for Lebanon:
ARIMA2_Lebanon<-auto.arima(COVID_series_Lebanon_Train)
summary(ARIMA2_Lebanon)
Series: COVID_series_Lebanon_Train
ARIMA(0,2,2)
Coefficients:
ma1 ma2
-0.8655 0.4654
s.e. 0.0939 0.1264
sigma^2 estimated as 61.55: log likelihood=-249.92
AIC=505.84 AICc=506.19 BIC=512.67
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.286841 7.630167 3.533028 2.334374 15.75454 0.4959827 -0.05944259
Forecast prediction to compare
P_LBN_2<-forecast(ARIMA2_Lebanon,h=length(COVID_series_Lebanon_Test))
Calculate errors:
RE_LBN_2<-RE(P_LBN_2$mean,as.vector(COVID_series_Lebanon_Test))
MAPE_LBN_2<-MAPE(P_LBN_2$mean,as.vector(COVID_series_Lebanon_Test))
MSE_LBN_2<-MSE(P_LBN_2$mean,as.vector(COVID_series_Lebanon_Test))
PFA_LBN_2<-PFA(P_LBN_2$mean,as.vector(COVID_series_Lebanon_Test))
Errors.LBN_2<-c(RE_LBN_2,MAPE_LBN_2,MSE_LBN_2,PFA_LBN_2)
Errors.LBN_2
[1] 0.01246393 1.23899990 62.67620214 0.50000000
Errors<-rbind(Errors.LBN_1,Errors.LBN_2)
rownames(Errors)<-c("ARIMAX(1,1,2)","ARIMA(0,2,2)")
colnames(Errors)<-c("Relative error","Mean Abs % error","Mean Squared Error","PFA")
Errors<-as.data.frame(Errors)
maximum<-apply(Errors,2,max)
minimum<-apply(Errors,2,min)
Errors<-rbind(minimum,Errors)
Errors<-rbind(maximum,Errors)
radarchart(Errors,maxmin=TRUE,axistype=4,axislabcol="slategray4",
centerzero=FALSE,seg=8,cglcol="gray67",
pcol=c("dodgerblue2","firebrick2","darkorange2","darkorchid2"),
plty=1,
plwd=3,
title="Error comparison")
legend <-legend(1.5,1, legend=c("ARIMAX(1,1,2)","ARIMA(0,2,2)"),
seg.len=-1.4,
title="Errors",
pch=21,
bty="n" ,lwd=3, y.intersp=1, horiz=FALSE,
col=c("dodgerblue2","firebrick2","darkorange2","darkorchid2"))
Conclusion: Keep model without temperature
Make model with the overall series
#Auto Arima for Lebanon:
ARIMA_Final_Lebanon<-auto.arima(COVID_Day_series_Lebanon_Temp)
summary(ARIMA_Final_Lebanon)
Series: COVID_Day_series_Lebanon_Temp
ARIMA(2,2,0)
Coefficients:
ar1 ar2
-0.9293 -0.2992
s.e. 0.1071 0.1078
sigma^2 estimated as 61.63: log likelihood=-277.8
AIC=561.6 AICc=561.92 BIC=568.75
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.3459771 7.656536 3.690808 2.655543 13.12427 0.4745325 -0.02063533
Final forecast:
Future_Lebanon_Temp<-Lebanon_Temp$Temperature_Lebanon[Lebanon_Temp$Date>max(COVID_2$Date2)]
P_LBN_Final<-forecast(ARIMA_Final_Lebanon,h=length(Future_Lebanon_Temp))
Low_lim_LBN<-data.frame(P_LBN_Final$lower)[,2]
Upp_lim_LBN<-data.frame(P_LBN_Final$upper)[,2]
For making the plot:
##Data periods
per_1 <- as.Date(as.character(COVID_2_Day_Lebanon_Temp$Date2))
per_2 <- seq(as.Date(max(COVID_2_Day_Lebanon_Temp$Date2)+1,format="%Y-%m-%d"), as.Date("2020-04-30",format="%Y-%m-%d"),"1 day")
# Merge forecast + actuals
data <- xts(COVID_Day_series_Lebanon_Temp,order.by=per_1)
dataNA <- rep(NA, length(data))
A <- cbind(data,dataNA,dataNA,dataNA)
Low_lim_LBN <- xts(Low_lim_LBN,order.by=per_2)
Forecast_LBN <- xts(P_LBN_Final$mean,order.by=per_2)
Upp_lim_LBN <- xts(Upp_lim_LBN,order.by=per_2)
predNA <- rep(NA, length(Forecast_ITA))
B <- cbind(predNA, Low_lim_LBN, Forecast_LBN, Upp_lim_LBN)
all_series_LBN <- data.frame(rbind(as.matrix(A),as.matrix(B)))
colnames(all_series_LBN) <- c('Actual', 'Lower_limit', 'Forecast', 'Upper_limit')
dygraph(all_series_LBN, main="SARS-COV2-outbreak: Total Lebanon cases",xlab="Date", ylab="Novel coronavirus cases",width = 750)%>%
dySeries(c('Lower_limit', 'Forecast', 'Upper_limit'),label="Forecast",strokeWidth=2,
drawPoints = TRUE, pointSize = 2, color=rgb(218/255,55/255,50/255)) %>%
dySeries("Actual",drawPoints = TRUE, strokeWidth=2, pointSize = 2,
color=rgb(73/255,163/255,90/255)) %>%
dyRangeSelector()
NA